# Calculate/load Differential Expression metrics for all genes, load SFARI dataset:
# Gene expression data
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')
# SFARI genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
SFARI_genes = SFARI_genes %>% inner_join(datProbes, by=c('gene-symbol'='external_gene_id')) %>%
mutate('ID' = ensembl_gene_id) %>%
dplyr::select(ID, `gene-score`, syndromic)
# Balance Groups by covariates, remove singular batches (none)
to_keep = (datMeta$Subject_ID != 'AN03345') & !is.na(datMeta$Dx)
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
if(!file.exists('./../working_data/genes_ASD_DE_info.csv')) {
# Calculate differential expression for ASD
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
genes_DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),] %>%
mutate('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID')
write_csv(genes_DE_info, path='./../working_data/genes_ASD_DE_info.csv')
rm(mod, corfit, lmfit, fit, top_genes)
} else {
genes_DE_info = read_csv('./../working_data/genes_ASD_DE_info.csv')
}
rm(to_keep, datSeq, datProbes)
Genes with an autism score don’t seem to have better behaviour than the rest of the genes:
None of the genes with score=1 have an adjusted p-value lower than 0.05, so they are all lost when filtering the data (!)
Only a subset of 1/4 of the original number of genes without score is plotted so the plot is not that havy
genes_DE_info = genes_DE_info %>% mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))
# Create subsample of genes without score so plot is not that heavy
genes_DE_info_score = genes_DE_info %>% filter(`gene-score`!='None')
genes_DE_info_sample = genes_DE_info %>% filter(`gene-score`=='None') %>%
sample_frac(0.25) %>% bind_rows(genes_DE_info_score)
ggp = ggplotly(genes_DE_info_sample %>% ggplot(aes(adj.P.Val, logFC, color=`gene-score`)) +
geom_point(aes(ids=ID, alpha=`gene-score`)) + scale_alpha_discrete(range=c(1, 0.2)) +
geom_vline(aes(xintercept=0.03), color='#666666') + scale_color_manual(values=gg_colour_hue(7)) +
theme_minimal())
rangeslider(ggp, start=0, end=1.05)
rm(genes_DE_info_score, genes_DE_info_sample, ggp)
Score=1 has consistently the lowest logFC
The median seems to increase as the score increases as well as the variance
ggplotly(genes_DE_info %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() +
scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
Looks OK…
top_gene_id = genes_DE_info$ID[which.max(abs(genes_DE_info$logFC))]
bottom_gene_id = genes_DE_info$ID[which.min(abs(genes_DE_info$logFC))]
top_bottom_gene_expr = data.frame('Expr_top'=datExpr[rownames(datExpr)==top_gene_id,],
'Expr_bottom'=datExpr[rownames(datExpr)==bottom_gene_id,],
'Diagnosis'=datMeta$Diagnosis)
p1 = top_bottom_gene_expr %>% ggplot(aes(Expr_bottom, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.4) + theme_minimal()
p2 = top_bottom_gene_expr %>% ggplot(aes(Expr_top, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
rm(top_gene_id, bottom_gene_id, top_bottom_gene_expr, p1, p2)
Same as with the boxplots for log fold change
make_ASD_vs_CTL_df = function(datExpr){
datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))
ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
left_join(SFARI_genes, by='ID') %>%
dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))
return(ASD_vs_CTL)
}
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + geom_boxplot() +
scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal()
# ASD_vs_CTL %>% ggplot(aes(`gene-score`, sd_diff, fill=`gene-score`)) + geom_boxplot() +
# scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal()
Score 1 outlier has ID = ENSG00000136535 (TBR1), it’s the same one that almost made the cut in the scatter plot
ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(mean_diff), color=`gene-score`, fill=`gene-score`)) +
geom_density(alpha=0.3) + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(7)) +
scale_color_manual(values=gg_colour_hue(7)))
Score 1 outlier is the same one as in the previous plot
ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(sd_diff), color=`gene-score`, fill=`gene-score`)) +
geom_density(alpha=0.3) + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(7)) +
scale_color_manual(values=gg_colour_hue(7)))
Loading raw data, modifying metadata and SFARI genes manually and filtering genes
# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
print('Columnd in datExpr don\'t match the rowd in datMeta!')
}
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
#################################################################################
# FILTERS:
# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
# 3. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
#################################################################################
# Annotate SFARI genes
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=ensembl_gene_id) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
datExpr_backup = datExpr
rm(getinfo, to_keep, gene_names, mart)
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
Genes labeled with Score 1 and 2 are the most affected, now their medians are just above the median of the genes without a score, as well as their 3rd quantile. (Does this make sense? Log2 is a monotonic transformation, why would it affect different distributions in a non monotonic way?)
ASD_vs_CTL = make_ASD_vs_CTL_df(log2(datExpr_backup+1))
ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
Now both the median and the 3rd quantile of the genes with score 1 and 2 are below the general median of the data without score
datExpr = datExpr_backup # Should be the same, but in case code is ran in different order...
# Check datProbes and datExpr have same rows
if(!all(rownames(datExpr) == rownames(datProbes))){
print('Rows in datExpr don\'t match the ones in datProbes!')
}
# Conditional Quantile Normalisation (CQN)
cqn.dat = cqn(datExpr, lengths = datProbes$length,
x = datProbes$percentage_gc_content,
lengthMethod = 'smooth',
sqn = FALSE) # Run cqn with specified depths and with no quantile normalisation
datExpr = cqn.dat$y + cqn.dat$offset # Get the log2(Normalised RPKM) values
Scores 1 and 2 now have the same median as the genes without score and a lower 3rd quantile
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
# Filter out genes with low counts (filter 43406)
pres = apply(datExpr>1, 1, sum)
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]
datExpr_post_Norm = datExpr
Genes with score 1 and 2 now have lower medians and 3rd quantiles than the genes withouth a score
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
datExpr = datExpr_backup
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_+Region)
#Estimate size factors
dds = estimateSizeFactors(dds)
# Plot column sums according to size factor
# plot(sizeFactors(dds), colSums(counts(dds)))
# abline(lm(colSums(counts(dds)) ~ sizeFactors(dds) + 0))
datExpr = log2(counts(dds, normalized=TRUE) + 1)
# Filter out genes with low counts (filter 35303)
pres = apply(datExpr>1, 1, sum)
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]
Same result as before …
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
vst_output = vst(dds)
datExpr = assay(vst_output)
# Filter out genes with low counts (filter 0)
pres = apply(datExpr>1, 1, sum)
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]
A bit better, at least the median and 3rd quantiles of genes with score are higher than genes without a score, but the distinction was better before performing the log2 transformation
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())
Before:
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_backup)
## Warning: Column `ID` joining factor and character vector, coercing into
## character vector
ggp = ggplotly(ASD_vs_CTL %>% filter(`gene-score`!='None') %>%
ggplot(aes(abs(mean_diff), color=`gene-score`, fill=`gene-score`)) +
geom_density(alpha=0.3) + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(7)) +
scale_color_manual(values=gg_colour_hue(7)))
rangeslider(ggp, start=0, end=6000)
After:
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_post_Norm)
ggp = ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(mean_diff), color=`gene-score`, fill=`gene-score`)) +
geom_density(alpha=0.3) + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(7)) +
scale_color_manual(values=gg_colour_hue(7)))
rangeslider(ggp, start=0, end=3)